********************************************************************************
* 
* Generating Robust ID and addressing duplicates 
* 
********************************************************************************

clear all 

use "$data\registration_voting_outcomes_withxwalk.dta" 
keep if mci_uniq_id!=. //keep only merged observations as a result 
compress 
duplicates report voter_id //no duplicates in terms of voter id
split voter_id, parse("-") 
keep mci_uniq_id voter_id1 *name* *dob* 
bys voter_id1 mci_uniq_id: keep if _n==1 
duplicates tag voter_id1, gen(dup)
tab dup
*Creating list of mci_uniq_ids that are duplicates according to voter_id1
preserve 
	keep if dup>0 
	keep voter_id1 mci_uniq_id 
	bys voter_id1 (mci_uniq_id): gen count=_n
	count if mci_uniq_id==. 
	reshape wide mci_uniq_id, i(voter_id1) j(count)
	count if mci_uniq_id1>mci_uniq_id2 //the 2 is always the highest of the two mci_uniq_id's (for matching)
	rename mci_uniq_id1 mci_uniq_id 
	keep mci_uniq_id mci_uniq_id2
	tempfile voteriddups 
	save "`voteriddups'"
restore 
/* the state is appending those two-digit values at the end of the voterid to indicate new/duplicated voter registrations across counties */ 

duplicates report mci
distinct mci //200k approx of voter IDs - 13% of voter ids matched - are matched to an mci uniq ID that matched with another voter ID! 
keep mci_uniq_id 
duplicates drop 
tempfile idoutcomes 
save "`idoutcomes'"

********************
* 	ID file  
********************
import excel using "$data\Raw\allegheny_raw_25nov19\MCI matching file Updated.xlsx", firstrow case(l) clear sheet("Sheet 1") allstring 

*clean date variable 
split dob, parse(" ")
gen dob_mdy=date(dob1,"DMY")
format dob_mdy %td
drop dob1 dob2 dob 

gen everadult=td(3nov2002)>=dob_mdy //last election date in same is Nov 3 2020, hence this restriction  

*clean names 
gen fullname=fname+lname 

cap drop unknown 
gen unknown=strpos(fullname, "UNKNOWN")>0
replace unknown=1 if fullname=="" & unknown==0 
replace unknown=1 if fullname=="UNIDENTIFIED" & unknown==0
replace unknown=1 if fullname=="UNIKNOWN" & unknown==0 
replace unknown=1 if fullname==".." & unknown==0 
replace unknown=1 if substr(fname,1,3)=="UNK" 
replace unknown=1 if (substr(fname,1,1)=="("|substr(fname,1,1)=="/"|substr(fname,1,1)=="0"|substr(fname,1,1)=="1"|substr(fname,1,1)=="2"|substr(fname,1,1)=="3"|substr(fname,1,1)=="4"|substr(fname,1,1)=="5"|substr(fname,1,1)=="7"|substr(fname,1,1)=="8") & unknown==0 
replace unknown=strpos(fullname,"VOID")>0 if unknown==0 
replace unknown=strpos(fullname,"VOID")>0 if unknown==0 


replace unknown=strpos(fullname,"ANONYMOUS")>0 if unknown==0 
replace unknown=1 if fullname=="" 
tab unknown

cap drop terminated 
gen terminated=strpos(fullname,"TERMINATE")>1 
replace terminated=strpos(fullname, "TPR")>1 if terminated==0 
replace terminated=1 if terminated==0 & (fname=="PARENTAL RIGHTS"|fname=="PARENT") 

label var unknown "Unknown name"
label var terminated "TPR"

*formatting 
destring mci_uniq_id mci_id client_id, replace 
count if mci_id==. //many missing mci_id because need a certain number of demographic information to be able to create an mci_id, including notably date of birth 
format mci_uniq_id %12.0g 
format mci_id %12.0g 

drop fullname 
tempfile tempid 
save "`tempid'" 

preserve 
drop *name dob_mdy everadult 
duplicates drop 
save "$data\Processed\MCImatchingfileupdated", replace 
restore 

************************************************
* Creating flag for matching to voter_id 
************************************************
merge m:1 mci_uniq_id using "`idoutcomes'"
drop if _merge==2 //individuals that exist in the data warehouse and have an mci_uniq_id but not involved with CPS in this time period. 
distinct mci_uniq_id 
distinct mci_uniq_id if _merge==3  

************************ 
* Addressing duplicates and generating one robust ID for all clients 
************************
duplicates tag mci_uniq_id, gen(dup)
tab dup unknown 
tab dup terminated if unknown!=1 

tab unkn _merge 
tab termina _merge //most terminated and unknown are not matching to a voter id

* Should not be merging duplicate cases that are not matched to an outcomes ID that is not for a real person 
preserve
	keep if terminated==1 | unknown==1  
	keep client_id unknown terminated 
	duplicates drop //client_id is unique in this dataset 
	save "$data\processed\IDtprORunk", replace 
restore 
drop if terminated==1 | unknown==1 

*** Step 0: merging mci_uniq_ids that have the same voter_id 
merge m:1 mci_uniq_id using "`voteriddups'", gen(_mvoter) //only 24 obs relevant to CPS 
drop if _mvoter==2 
rename mci_uniq_id2 mci_uniq_id1 
format mci_uniq_id1 %12.0g 
replace mci_uniq_id1=mci_uniq_id if mci_uniq_id1==. 
drop _mvoter 

*** Step 1: If there are individuals with a client ID that has sometimes an mci_uniq_id that merges and another that doesn't, then keep the mci_uniq_id that merges 
bys client_id: egen anymerged=max(_merge)
by client_id: egen anynotmerged=min(_merge)
tab _merge anymerged //123 cases where the mci_uniq_id did not match but there is the same cl_id with another mci_uniq_id that matched 
duplicates report client_id if anymerged==3 & _merge==1 //unique cl_id's 
preserve
	keep if  anymerged==3 & anynotmerged==1
	bys client_id (_merge mci_uniq_id1): gen long mci_uniq_id2=mci_uniq_id1[_N]
	keep if mci_uniq_id2!=mci_uniq_id1
	tempfile data 
	save "`data'"
restore 
merge m:1 client_id mci_uniq_id1 using "`data'",gen(check)
replace mci_uniq_id2=mci_uniq_id1 if check!=3 
drop check  
format mci_uniq_id2 %12.0g 

*** Step 2 : Since I now have, by client id, an mci_uniq_id that either always merges or never merges, then I can take the highest number among those 
bys client_id (mci_uniq_id2): gen long mci_uniq_id3 = mci_uniq_id2[_N] 
count if mci_uniq_id3!=mci_uniq_id2 //115 changes of clients who had the same client_id but a different mci_uniq_id 
format mci_uniq_id3 %12.0g 
//Note - CPS sometimes accidentally associated two different individuals (with the same mci_uniq_id) to the same client_id. 

*** Step 3: I now want to do the same thing for mci_id's: if a given mci_id has multiple mci_uniq_id's affiliated to it, then I want to pick the mci_uniq_id that is merged 
sort mci_id 
by mci_id: egen anymerged2=max(anymerged) if mci_id!=. 
by mci_id: egen anynotmerged2=min(anynotmerged) if mci_id!=. 
tab anymerged2 anynotmerged2, m // 980 that had some mci_id that merged in some cases but not all 
duplicates report mci_id if anynotmerged2==1 & anymerged2==3 
preserve
	keep if  anymerged2==3 & anynotmerged2==1
	sort mci_id _merge mci_uniq_id3 
	by mci_id: gen long mci_uniq_id4=mci_uniq_id3[_N]
	keep if mci_uniq_id4!=mci_uniq_id3
	keep mci_id mci_uniq_id4 mci_uniq_id3
	duplicates drop 
	cap tempfile data 
	save "`data'", replace 
restore 
merge m:1 mci_id mci_uniq_id3 using "`data'",gen(check)
replace mci_uniq_id4=mci_uniq_id3 if check!=3 
drop check anymerged anynotmerged
format mci_uniq_id4 %12.0g 


*** Step 4 : Among these new ID's, then find the max mci_uniq_id's 
preserve
keep if mci_id==. 
tempfile data 
save "`data'", replace 
restore 
drop if mci_id==. 
bys mci_id (mci_uniq_id4): gen long mci_uniq_id5 = mci_uniq_id4[_N] 

count if mci_uniq_id5!=mci_uniq_id3 //897 changes were made 
append using "`data'"
replace mci_uniq_id5=mci_uniq_id4 if mci_id==. 
format mci_uniq_id5 %12.0g 

*** Step 5: Need to keep doing these things until nothing changes 
*#1 (mci_uniq)
bys mci_uniq_id: egen anymerged3=max(anymerged2)
tab anymerged3 anymerged2, m  
sort mci_uniq_id anymerged3 anymerged2 _merge mci_uniq_id5 
by mci_uniq_id: gen long mci_uniq_id6 = mci_uniq_id5[_N] 
count if mci_uniq_id5!=mci_uniq_id6 //307 changes were made 
*#2 (mci)
bys mci_id: egen anymerged4=max(anymerged3)
preserve
	keep if mci_id==. 
	tempfile data
	save "`data'", replace 
restore 
drop if mci_id==. 
sort mci_id anymerged4 anymerged3 anymerged2 _merge mci_uniq_id6 
by mci_id: gen long mci_uniq_id7 = mci_uniq_id6[_N] 
append using "`data'"
replace mci_uniq_id7=mci_uniq_id6 if mci_id==. 
count if mci_uniq_id7!=mci_uniq_id6 & mci_id!=. //3 changes were made 
*#3 (mci_uniq)
bys mci_uniq_id: egen anymerged5=max(anymerged4)
sort mci_uniq_id anymerged5 anymerged4 anymerged3 anymerged2 _merge mci_uniq_id7 
by mci_uniq_id: gen long mci_uniq_id8 = mci_uniq_id7[_N] 
count if mci_uniq_id7!=mci_uniq_id8  //1 changes were made 
*#4 checked no more changes for mci_uniq_id nor cl_id! 
bys mci_id: egen anymerged6=max(anymerged5)
preserve
	keep if mci_id==. 
	tempfile data
	save "`data'", replace 
restore 
drop if mci_id==. 
sort mci_id anymerged6 anymerged5 anymerged4 anymerged3 anymerged2 _merge mci_uniq_id8 
by mci_id: gen long mci_uniq_id9 = mci_uniq_id8[_N] 
count if mci_uniq_id9!=mci_uniq_id8 //no changes anymore 
append using "`data'"
drop mci_uniq_id9 anymerged6 
bys mci_uniq_id: egen anymerged6=max(anymerged5)
sort mci_uniq_id anymerged6 anymerged5 anymerged4 anymerged3 anymerged2 _merge mci_uniq_id8 
by mci_uniq_id: gen long mci_uniq_id9 = mci_uniq_id8[_N] 
count if mci_uniq_id9!=mci_uniq_id8  //no changes anymore 
drop mci_uniq_id9 anymerged6 


*** Step 6 - I want to check all these duplicates which I generated to make sure they seem ok.
*checking whether everadult is constant with an individual (it should be conditional on the DOB being the same, but the later is not always true)
*clean date variable 
bys mci_uniq_id8: egen mn=min(everadult)
bys mci_uniq_id8: egen mx_everadult=max(everadult)
tab mn mx, m //they don't always match, so use the maximum. 
*Preparation to output data to look at 
drop mn  
preserve
keep if mci_uniq_id1!=mci_uniq_id8 
keep mci_uniq_id8 
duplicates drop 
tempfile data 
save "`data'", replace
restore 
//want to keep main obs that didn't change too
preserve 
merge m:1 mci_uniq_id8 using "`data'", gen(tocheck) 
keep if tocheck==3 
drop if mx_everadult!=1
drop tocheck 
order mci_uniq_id8 mci_uniq_id1 mci_id client_id fname lname dob_mdy _merge  
keep mci_uniq_id8 mci_uniq_id1 mci_id client_id fname lname dob_mdy _merge 
save "`data'", replace 
//creating list with 
keep mci_uniq_id8 fname lname dob_mdy  
bys mci_uniq_id8: gen nbobs=_N
duplicates drop 
duplicates tag mci_uniq_id8, gen(dup) 
tab dup 
keep if dup==0 
keep mci_uniq_id8 
duplicates drop 
tempfile noneedforhandcheck 
save "`noneedforhandcheck'"
use "`data'", clear 
merge m:1 mci_uniq_id8 using "`noneedforhandcheck'", gen(_clear)
drop if _clear==3 
drop _clear 
gen voterdata="Yes" if _merge==3 
drop _merge 
duplicates drop 
duplicates report mci_uniq_id1 mci_id client_id //not such a big deal so keeping all in case the names/DOB change substantially from one obs to the next 
dis _N 
distinct mci_uniq_id8 //694 
sort mci_uniq_id8 mci_uniq_id1 mci_id
//export excel "$data\DuplicatestoCheck.xlsx", replace firstrow(var) 
restore 


*Merging in Hand Corrections 
/*We have not provided the "DuplicatestoCheckChecked.xlsx" which records IDs of individuals that our code #2 would have merged together as duplicates but where the names, date of birth and gender across duplicates did not perfectly match up. These observations were hand checked using the PII information to determine whether or not the individuals should indeed be matched or retain another ID. Please reach out to us if you want these manual coding (changed IDs applies to only 182 individuals). */ 
preserve
import excel using  "$data\DuplicatestoCheckChecked", clear firstrow
format mci_uniq_id1 mci_id client_id mci_uniq_id8 correctedid %12.0g 
tempfile checked
save "`checked'"
restore  
merge 1:1 mci_uniq_id8 mci_uniq_id1 mci_id client_id fname lname dob_mdy using "`checked'", gen(check)

 
gen handchecked="Not Corrected" if check==3 & correctedid==. 
replace handchecked="Corrected" if check==3 & correctedid!=. 
drop check 
label var handchecked "Hand checked duplicates"
tab handchecked //only 9.6% needed to be corrected 

*Making appropriate changes for the final ID 
gen long mci_uniq_id9=mci_uniq_id8 
replace mci_uniq_id9=correctedid if handchecked=="Corrected" //181 changes made  
label var mci_uniq_id9 "ID for Outcomes after Doing a Final Manual Check"
drop correctedid voterdata  

//append using "$data\processed\IDtprORunk"


**** Saving Crosswalk file for IDs 

keep mci_uniq_id mci_id client_id handchecked mci_uniq_id9 
duplicates drop 
duplicates tag mci_uniq_id mci_id client_id, gen(dup)
drop if dup>0 & handchecked=="Not Corrected" 
drop dup 

save "$data\Processed\IDmasterfile", replace 


